set.seed(1)
project_2_data = 
  read_csv("project_2_data.csv", na = c("NA", "", ".")) |>
  janitor::clean_names()  |>
  mutate(status = ifelse(status == "Dead", 1, 0))
## Rows: 4024 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): Race, Marital Status, T Stage, N Stage, 6th Stage, differentiate, ...
## dbl  (5): Age, Tumor Size, Regional Node Examined, Reginol Node Positive, Su...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dead <- project_2_data |>
  filter(status == 1)

alive <- project_2_data |>
  filter(status == 0) |>
  sample_n(nrow(dead))

project_2_numdata<-
  bind_rows(dead, alive) |>
  mutate(
    t_stage = case_when(
    t_stage == "T1" ~ 1,
    t_stage == "T2" ~ 2,
    t_stage == "T3" ~ 3,
    t_stage == "T4" ~ 4,
    TRUE ~ NA_real_),
    n_stage = case_when(
    n_stage == "N1" ~ 1,
    n_stage == "N2" ~ 2,
    n_stage == "N3" ~ 3,
    TRUE ~ NA_real_),
    differentiate = case_when(
    differentiate == "Well differentiated" ~ 1,
    differentiate == "Moderately differentiated" ~ 2,
    differentiate == "Poorly differentiated" ~ 3,
    differentiate == "Undifferentiated" ~ 4,
    TRUE ~ NA_real_),
    grade = case_when(
    grade == "anaplastic; Grade IV" ~ 4,
    grade == "3" ~ 3,
    grade == "2" ~ 2,
    grade == "1" ~ 1,
    TRUE ~ NA_real_),
    a_stage_regional = ifelse(a_stage == "Regional", 1, 0),
    estrogen_status = ifelse(estrogen_status == "Positive", 1, 0),
    progesterone_status = ifelse(progesterone_status == "Positive", 1, 0)
    ) |>
  select(-a_stage)
variables <- names(project_2_numdata)[!names(project_2_numdata) %in% c("survival_months", "status")]
for (var in variables) {
  formula <- as.formula(paste("Surv(survival_months, status) ~", var))
  model <- coxph(formula, data = project_2_numdata)
  print(var)
  print(summary(model))
}
## [1] "age"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##         coef exp(coef) se(coef)     z Pr(>|z|)   
## age 0.013313  1.013402 0.004454 2.989   0.0028 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##     exp(coef) exp(-coef) lower .95 upper .95
## age     1.013     0.9868     1.005     1.022
## 
## Concordance= 0.536  (se = 0.012 )
## Likelihood ratio test= 9.05  on 1 df,   p=0.003
## Wald test            = 8.93  on 1 df,   p=0.003
## Score (logrank) test = 8.95  on 1 df,   p=0.003
## 
## [1] "race"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##              coef exp(coef) se(coef)      z Pr(>|z|)   
## raceOther -0.5965    0.5508   0.2099 -2.842  0.00448 **
## raceWhite -0.3263    0.7216   0.1252 -2.607  0.00914 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## raceOther    0.5508      1.816    0.3650    0.8310
## raceWhite    0.7216      1.386    0.5646    0.9222
## 
## Concordance= 0.527  (se = 0.008 )
## Likelihood ratio test= 9.46  on 2 df,   p=0.009
## Wald test            = 9.81  on 2 df,   p=0.007
## Score (logrank) test = 9.92  on 2 df,   p=0.007
## 
## [1] "marital_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                             coef exp(coef) se(coef)      z Pr(>|z|)  
## marital_statusMarried   -0.23876   0.78761  0.11794 -2.024   0.0429 *
## marital_statusSeparated  0.53873   1.71383  0.27908  1.930   0.0536 .
## marital_statusSingle    -0.10841   0.89726  0.14404 -0.753   0.4517  
## marital_statusWidowed    0.04877   1.04998  0.17757  0.275   0.7836  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                         exp(coef) exp(-coef) lower .95 upper .95
## marital_statusMarried      0.7876     1.2697    0.6251    0.9924
## marital_statusSeparated    1.7138     0.5835    0.9918    2.9616
## marital_statusSingle       0.8973     1.1145    0.6766    1.1899
## marital_statusWidowed      1.0500     0.9524    0.7414    1.4871
## 
## Concordance= 0.537  (se = 0.011 )
## Likelihood ratio test= 12.55  on 4 df,   p=0.01
## Wald test            = 14  on 4 df,   p=0.007
## Score (logrank) test = 14.36  on 4 df,   p=0.006
## 
## [1] "t_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##            coef exp(coef) se(coef)     z Pr(>|z|)    
## t_stage 0.38440   1.46874  0.04713 8.156 3.48e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## t_stage     1.469     0.6809     1.339     1.611
## 
## Concordance= 0.582  (se = 0.011 )
## Likelihood ratio test= 62.94  on 1 df,   p=2e-15
## Wald test            = 66.51  on 1 df,   p=3e-16
## Score (logrank) test = 67.36  on 1 df,   p=2e-16
## 
## [1] "n_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##            coef exp(coef) se(coef)    z Pr(>|z|)    
## n_stage 0.61870   1.85652  0.04835 12.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##         exp(coef) exp(-coef) lower .95 upper .95
## n_stage     1.857     0.5386     1.689     2.041
## 
## Concordance= 0.623  (se = 0.011 )
## Likelihood ratio test= 152  on 1 df,   p=<2e-16
## Wald test            = 163.7  on 1 df,   p=<2e-16
## Score (logrank) test = 175.6  on 1 df,   p=<2e-16
## 
## [1] "x6th_stage"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                  coef exp(coef) se(coef)      z Pr(>|z|)    
## x6th_stageIIB  0.4390    1.5512   0.1336  3.287  0.00101 ** 
## x6th_stageIIIA 0.7669    2.1530   0.1260  6.088 1.15e-09 ***
## x6th_stageIIIB 1.4792    4.3893   0.2464  6.003 1.94e-09 ***
## x6th_stageIIIC 1.5199    4.5718   0.1274 11.934  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                exp(coef) exp(-coef) lower .95 upper .95
## x6th_stageIIB      1.551     0.6447     1.194     2.015
## x6th_stageIIIA     2.153     0.4645     1.682     2.756
## x6th_stageIIIB     4.389     0.2278     2.708     7.115
## x6th_stageIIIC     4.572     0.2187     3.562     5.868
## 
## Concordance= 0.637  (se = 0.012 )
## Likelihood ratio test= 171.5  on 4 df,   p=<2e-16
## Wald test            = 177.7  on 4 df,   p=<2e-16
## Score (logrank) test = 198.8  on 4 df,   p=<2e-16
## 
## [1] "differentiate"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                  coef exp(coef) se(coef)     z Pr(>|z|)    
## differentiate 0.44124   1.55463  0.06433 6.859 6.94e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##               exp(coef) exp(-coef) lower .95 upper .95
## differentiate     1.555     0.6432      1.37     1.764
## 
## Concordance= 0.574  (se = 0.011 )
## Likelihood ratio test= 48.05  on 1 df,   p=4e-12
## Wald test            = 47.04  on 1 df,   p=7e-12
## Score (logrank) test = 47.37  on 1 df,   p=6e-12
## 
## [1] "grade"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##          coef exp(coef) se(coef)     z Pr(>|z|)    
## grade 0.44124   1.55463  0.06433 6.859 6.94e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##       exp(coef) exp(-coef) lower .95 upper .95
## grade     1.555     0.6432      1.37     1.764
## 
## Concordance= 0.574  (se = 0.011 )
## Likelihood ratio test= 48.05  on 1 df,   p=4e-12
## Wald test            = 47.04  on 1 df,   p=7e-12
## Score (logrank) test = 47.37  on 1 df,   p=6e-12
## 
## [1] "tumor_size"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                coef exp(coef) se(coef)    z Pr(>|z|)    
## tumor_size 0.011933  1.012005 0.001574 7.58 3.46e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## tumor_size     1.012     0.9881     1.009     1.015
## 
## Concordance= 0.592  (se = 0.012 )
## Likelihood ratio test= 50.4  on 1 df,   p=1e-12
## Wald test            = 57.45  on 1 df,   p=3e-14
## Score (logrank) test = 58.25  on 1 df,   p=2e-14
## 
## [1] "estrogen_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                    coef exp(coef) se(coef)      z Pr(>|z|)    
## estrogen_status -0.9108    0.4022   0.1064 -8.562   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                 exp(coef) exp(-coef) lower .95 upper .95
## estrogen_status    0.4022      2.486    0.3265    0.4954
## 
## Concordance= 0.565  (se = 0.008 )
## Likelihood ratio test= 60.02  on 1 df,   p=9e-15
## Wald test            = 73.31  on 1 df,   p=<2e-16
## Score (logrank) test = 78.48  on 1 df,   p=<2e-16
## 
## [1] "progesterone_status"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                         coef exp(coef) se(coef)      z Pr(>|z|)    
## progesterone_status -0.71314   0.49010  0.08583 -8.308   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                     exp(coef) exp(-coef) lower .95 upper .95
## progesterone_status    0.4901       2.04    0.4142    0.5799
## 
## Concordance= 0.588  (se = 0.01 )
## Likelihood ratio test= 62.82  on 1 df,   p=2e-15
## Wald test            = 69.03  on 1 df,   p=<2e-16
## Score (logrank) test = 71.97  on 1 df,   p=<2e-16
## 
## [1] "regional_node_examined"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                            coef exp(coef) se(coef)     z Pr(>|z|)  
## regional_node_examined 0.010506  1.010561 0.004982 2.109    0.035 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                        exp(coef) exp(-coef) lower .95 upper .95
## regional_node_examined     1.011     0.9895     1.001      1.02
## 
## Concordance= 0.521  (se = 0.013 )
## Likelihood ratio test= 4.35  on 1 df,   p=0.04
## Wald test            = 4.45  on 1 df,   p=0.03
## Score (logrank) test = 4.44  on 1 df,   p=0.04
## 
## [1] "reginol_node_positive"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                           coef exp(coef) se(coef)     z Pr(>|z|)    
## reginol_node_positive 0.054926  1.056462 0.004627 11.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                       exp(coef) exp(-coef) lower .95 upper .95
## reginol_node_positive     1.056     0.9466     1.047     1.066
## 
## Concordance= 0.627  (se = 0.012 )
## Likelihood ratio test= 106.3  on 1 df,   p=<2e-16
## Wald test            = 140.9  on 1 df,   p=<2e-16
## Score (logrank) test = 148  on 1 df,   p=<2e-16
## 
## [1] "a_stage_regional"
## Call:
## coxph(formula = formula, data = project_2_numdata)
## 
##   n= 1232, number of events= 616 
## 
##                     coef exp(coef) se(coef)      z Pr(>|z|)    
## a_stage_regional -1.1046    0.3314   0.1747 -6.324 2.55e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                  exp(coef) exp(-coef) lower .95 upper .95
## a_stage_regional    0.3314      3.018    0.2353    0.4666
## 
## Concordance= 0.522  (se = 0.005 )
## Likelihood ratio test= 29.49  on 1 df,   p=6e-08
## Wald test            = 39.99  on 1 df,   p=3e-10
## Score (logrank) test = 44.22  on 1 df,   p=3e-11

All variables are statistically significant except for marriage in “marital_status”

project_2_numdata <-
  project_2_numdata|>
  mutate(Married = ifelse(marital_status == "Married", 1, 0)) |>
  select(-differentiate, - marital_status)

cor_matrix <- cor(project_2_numdata[, sapply(project_2_numdata, is.numeric)])
print(cor_matrix)
##                                age      t_stage     n_stage       grade
## age                     1.00000000 -0.050113404 -0.01706295 -0.12321448
## t_stage                -0.05011340  1.000000000  0.30612098  0.15076838
## n_stage                -0.01706295  0.306120984  1.00000000  0.17991601
## grade                  -0.12321448  0.150768380  0.17991601  1.00000000
## tumor_size             -0.07953275  0.784997546  0.30277440  0.14403058
## estrogen_status         0.12111658 -0.106722673 -0.12700846 -0.26247316
## progesterone_status     0.01233580 -0.071974613 -0.11108594 -0.21201525
## regional_node_examined -0.03141638  0.088442919  0.36808270  0.07651544
## reginol_node_positive   0.01628318  0.244266968  0.82874594  0.15163577
## survival_months        -0.08599693 -0.159716020 -0.24737291 -0.15052637
## status                  0.06976636  0.225250913  0.34556514  0.19151006
## a_stage_regional        0.04777171 -0.234675561 -0.25575034 -0.04874199
## Married                -0.04796418  0.003435462 -0.05151826 -0.02717180
##                         tumor_size estrogen_status progesterone_status
## age                    -0.07953275     0.121116575          0.01233580
## t_stage                 0.78499755    -0.106722673         -0.07197461
## n_stage                 0.30277440    -0.127008462         -0.11108594
## grade                   0.14403058    -0.262473159         -0.21201525
## tumor_size              1.00000000    -0.123020668         -0.08438705
## estrogen_status        -0.12302067     1.000000000          0.58504596
## progesterone_status    -0.08438705     0.585045962          1.00000000
## regional_node_examined  0.11473470    -0.041404301         -0.01390137
## reginol_node_positive   0.24878553    -0.100873585         -0.07105526
## survival_months        -0.16151178     0.227274035          0.21200308
## status                  0.19535827    -0.175795073         -0.20001424
## a_stage_regional       -0.12796649     0.127990099          0.07311096
## Married                -0.01036450     0.008579762         -0.01351800
##                        regional_node_examined reginol_node_positive
## age                               -0.03141638            0.01628318
## t_stage                            0.08844292            0.24426697
## n_stage                            0.36808270            0.82874594
## grade                              0.07651544            0.15163577
## tumor_size                         0.11473470            0.24878553
## estrogen_status                   -0.04140430           -0.10087358
## progesterone_status               -0.01390137           -0.07105526
## regional_node_examined             1.00000000            0.49395643
## reginol_node_positive              0.49395643            1.00000000
## survival_months                   -0.03087185           -0.21263747
## status                             0.06199862            0.31134369
## a_stage_regional                  -0.04237809           -0.19505294
## Married                            0.02581466           -0.03581806
##                        survival_months      status a_stage_regional
## age                        -0.08599693  0.06976636       0.04777171
## t_stage                    -0.15971602  0.22525091      -0.23467556
## n_stage                    -0.24737291  0.34556514      -0.25575034
## grade                      -0.15052637  0.19151006      -0.04874199
## tumor_size                 -0.16151178  0.19535827      -0.12796649
## estrogen_status             0.22727403 -0.17579507       0.12799010
## progesterone_status         0.21200308 -0.20001424       0.07311096
## regional_node_examined     -0.03087185  0.06199862      -0.04237809
## reginol_node_positive      -0.21263747  0.31134369      -0.19505294
## survival_months             1.00000000 -0.58302039       0.13738966
## status                     -0.58302039  1.00000000      -0.13123516
## a_stage_regional            0.13738966 -0.13123516       1.00000000
## Married                     0.06161119 -0.08197981       0.03226076
##                             Married
## age                    -0.047964181
## t_stage                 0.003435462
## n_stage                -0.051518259
## grade                  -0.027171798
## tumor_size             -0.010364496
## estrogen_status         0.008579762
## progesterone_status    -0.013518004
## regional_node_examined  0.025814656
## reginol_node_positive  -0.035818059
## survival_months         0.061611186
## status                 -0.081979813
## a_stage_regional        0.032260758
## Married                 1.000000000
cortable<-project_2_numdata|>
  select(-race,-x6th_stage)
cortable$regional_node_examined
##    [1]  9  9 12 15 16 23 24  8  5 21  4  3 33  7 11  3 16  8 28 20 14 23 20 13
##   [25] 15 14 20 17 11 23  3 34  1 17 17  4 12 27 14 20 28 29 18  3  4 13 23 18
##   [49]  7 41 11 10 27 19  5 24  2 12 16 15 40  9 10 19 17  3 19 16  7 25  1  9
##   [73] 10 15  9 11 29 21 17 17  7 15 22 47 22 14  9  9 13 14 10 19 15  6  2 54
##   [97]  5 31 28 21 19  8 13 13  9 18 12 15 27 24  5  7 22 17 16 12  9 23 10  6
##  [121] 19  5 15 10 30 18 15 27 14  3 36 16  5  9  3 32 31 21 18 10  3 20 10 25
##  [145]  9 14  9 29 32 19 27  1 11 13 14 21 11 15 16 22  2 14 17  5 10 23 10  3
##  [169] 17 18 18 23  9  9  8 23 19 12  9 26 12 15 23  9 14 22 16 30  4  2 10 13
##  [193]  1 10  4 15  7  5 28 12 18 28 18 12  6 18 26  3 21 36 26 18 18 28 16 30
##  [217]  2  1 21 14 21  4 10 16 19 15 29 15 34 23 27 17 13 12  3 13 14 36 11 12
##  [241]  9  9  5 13 11 20 10 14 16 47 15 16 29 16  9  7 11  6 16 18 22 18  4 25
##  [265] 29  1 13 13  9 21 25 10  2  8 21  2 26 17  4  9 13 16 10 13  1  6  9  9
##  [289] 18 22  8 15  8  4  4 18 12 17 15 19 16 27 15 10 14 13 12 11  8 14 21 13
##  [313] 17  4 19 12 17  7 25 37  8 24 13 22 15  3 13  3  2 19 12 11 11  2 20 12
##  [337] 19 11 15 12 21 10 10 17  3 15 17 15 17 11  8 18 21 18 10 29  9 18 13 20
##  [361] 35 23 12  7 32 18 18  5 28 10 10 18 11  9  9  1 30  3  7  9 19 57  5 22
##  [385] 12 15 12  3  8 15 25 10  4 26 25 16 23 13 17  8 24  9 10  9 36 19 12  8
##  [409] 21  8 10 30 17 11 12 20 22 14  4 11  1  4 18 20 12 19 15 21 16 18 11 16
##  [433] 17  5 16 23  7  5 19  9 22 22  8 10 19 11 11  7 12  8 13 12 24 12 16 16
##  [457] 20 13  3 30 22 35 11 10  7 30 29 14 12 17  2 24 15 10  9 12  4 19 23 28
##  [481] 17 14 27 12 16 11 19 22 16  8  9 10 20 15  4 24 19 18 12  4  9 17 22  2
##  [505] 11  6  2 16 21 20 12 17 21  7 15 13  6  4  9 30  2 29 13  3  8  5 14  6
##  [529] 15  1 14 11 21 34 11 18 13  7 11 11  4 16 19 20 12 13 30 19 17 12 12 15
##  [553] 12  9 12  6  8  2 12 28 14 24 24  4 26  7 18  8 18 13 22  9 28 21 10 13
##  [577]  6 16 12  2 15 16 17 17 23 25 47 21 19 18 18  3 31 11  2 16 14 16 23 13
##  [601] 14  7 10  7 16 25 28 10 16 22 23 21 21 19  6  2  8  6 12  2  5 14 15  8
##  [625]  3 23 19 19 15  7  7 14 10 10 14 31 17 25  7 12 15 12  4 17 19 24 14 24
##  [649]  6 25  6 19 34 16  3  7 15 19  2 26  9 21  2  9 13 10 17 13  4 15  5 13
##  [673] 21  5 18  6 21 11 18 11  7  4  5  5 13 18 17  5  7 14 23 11 15 13 13 10
##  [697] 43 47 20 14  9 26  7 12 28  1  5 18 18 12 11 20 41 23 10 20 16 21 26 12
##  [721]  6 17 12 11 20 22 26 10  8 15 11  4 16 13 21 14 17  9 10  8  3 12 22 37
##  [745] 18  7 11 23 13 25 12 17 29  1 12  7  1 23  4 10 25 13  1  9  9 11 26  9
##  [769] 18  3  8 24  9 14 24 18  2 15 24  9 23 27 16 15 18 16 17 25 15 15 18 15
##  [793] 12  2 13 20  9 13 16 15 15  9 23 13  2 20 19 29 22  2 15  9 23  7 12  3
##  [817] 13  1 31 15 20 12 18 10 15  7 23 24 13  8 12 13 20  7 22  4 11 14 16 17
##  [841] 10 13 11 19 17 15 25 15 19  9 20 16 14  6 14 11 13 16  7 17 16 13  5 35
##  [865]  1 20 20  5  5 26 15 15 16 20 13 14  7 15 27 22 30  4 34  1 20 14 18 10
##  [889] 16  8 29  4  9  8  3 10 25 18 20  9 17 28 15 20  3 18  9 21 14  7  1 12
##  [913] 15  6 17 35 13 25 18  8 16 26  5  8  9 25 27 13  3 16 29  6 14  3 17 16
##  [937] 17 32  8 16  3  2 16 20 24 13 19 19 11  3 14 12  8  6  1 23  7  2  4 13
##  [961] 12  6 37 15 17  2 14 11  3 12 27  6 19  4 19 12 14 18 15 12 20 10 21  8
##  [985] 17 17  8 20 15 10  6 21  6  8 17 24  7 13 14 27 15 23 15 11  1 24 13 26
## [1009] 16 18  5 16 28 19 12 14 14 23 17 18 31 12 11  9 20 16 18 18  2 13  4 14
## [1033] 12 13  4  3  2  8 11  3  2  2  2 13 10  1 16 12 18  5 28 16 13 15  7 22
## [1057] 17 11 10 16 14  9  8 16 15  6 18 14 15 14  3 22 38  3  5 17  9 23 14 13
## [1081]  9 24  9  9  6 16 18 17 10 24  8 27  7 22 10 12 28 14  9  7 17 26 11 23
## [1105]  9 11 12 32 32  6 13 16  8 19 37  3 18 14 10 21 13 12 15  9  6  7  9 13
## [1129] 31 13 18  4  6 18 12  3 16 11  3 29 26 14  9  8 29 18 27 11 20 19  9 16
## [1153] 13  3  9  2 12 10  7 12 25 26 18 14 17 18  9  9 12 13 14 18 30 14  6 15
## [1177]  2 19 16 14  9 10  9  5  7 11  4 14 10  4  9 12 12  3 16 16 17 24  9 11
## [1201] 16 19 11 14  3  4  5  5 17 12 16 22 11 10  8 18 21 10  4 25  3 19 14 16
## [1225] 15 16 16 11 18 13 11 12
chart.Correlation(cortable, histogram=TRUE, pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

We have to choice 1 between t_stage - tumor size n stage - regional_node_positive

muldata = project_2_numdata
  
  
cox_model <- coxph(Surv(survival_months, status) ~.,
                   data = muldata)

vifs <- vif(cox_model)  
## Warning in vif.default(cox_model): No intercept: vifs may not be sensible.
print(vifs)
##                             GVIF Df GVIF^(1/(2*Df))
## age                     1.078511  1        1.038514
## race                    1.089620  2        1.021689
## t_stage                 4.204881  1        2.050581
## n_stage                16.052408  1        4.006546
## x6th_stage             42.723885  4        1.598946
## grade                   1.132422  1        1.064153
## tumor_size              2.891806  1        1.700531
## estrogen_status         1.705080  1        1.305787
## progesterone_status     1.579316  1        1.256708
## regional_node_examined  1.766667  1        1.329160
## reginol_node_positive   4.281220  1        2.069111
## a_stage_regional        1.337110  1        1.156335
## Married                 1.094899  1        1.046374

remove t-stage,n_stage and regional_mode_positive due to higher VIF score

final = muldata |>
  select(-t_stage, -n_stage,-reginol_node_positive)
  
cox_model1 <- coxph(Surv(survival_months, status) ~.,
                   data = final)
vifs <- vif(cox_model1)
## Warning in vif.default(cox_model1): No intercept: vifs may not be sensible.
print(vifs)
##                            GVIF Df GVIF^(1/(2*Df))
## age                    1.065375  1        1.032170
## race                   1.084396  2        1.020462
## x6th_stage             2.150931  4        1.100470
## grade                  1.123660  1        1.060028
## tumor_size             1.298750  1        1.139627
## estrogen_status        1.647493  1        1.283547
## progesterone_status    1.541107  1        1.241413
## regional_node_examined 1.247171  1        1.116768
## a_stage_regional       1.327119  1        1.152007
## Married                1.093934  1        1.045913
summary(cox_model)
## Call:
## coxph(formula = Surv(survival_months, status) ~ ., data = muldata)
## 
##   n= 1232, number of events= 616 
## 
##                             coef exp(coef)  se(coef)      z Pr(>|z|)    
## age                     0.019327  1.019515  0.004705  4.108 3.99e-05 ***
## raceOther              -0.435919  0.646670  0.213742 -2.039  0.04140 *  
## raceWhite              -0.261756  0.769699  0.129732 -2.018  0.04363 *  
## t_stage                -0.027546  0.972830  0.096358 -0.286  0.77498    
## n_stage                 0.453734  1.574180  0.193949  2.339  0.01931 *  
## x6th_stageIIB           0.410796  1.508018  0.151806  2.706  0.00681 ** 
## x6th_stageIIIA          0.246022  1.278928  0.235577  1.044  0.29633    
## x6th_stageIIIB          0.836403  2.308051  0.388854  2.151  0.03148 *  
## x6th_stageIIIC          0.138145  1.148142  0.450956  0.306  0.75935    
## grade                   0.278566  1.321234  0.068832  4.047 5.19e-05 ***
## tumor_size              0.005146  1.005160  0.002796  1.841  0.06569 .  
## estrogen_status        -0.340392  0.711491  0.139382 -2.442  0.01460 *  
## progesterone_status    -0.449035  0.638244  0.108196 -4.150 3.32e-05 ***
## regional_node_examined -0.021173  0.979050  0.006570 -3.222  0.00127 ** 
## reginol_node_positive   0.027317  1.027694  0.011106  2.460  0.01391 *  
## a_stage_regional       -0.305088  0.737058  0.202176 -1.509  0.13129    
## Married                -0.114324  0.891969  0.085494 -1.337  0.18115    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                        exp(coef) exp(-coef) lower .95 upper .95
## age                       1.0195     0.9809    1.0102    1.0290
## raceOther                 0.6467     1.5464    0.4253    0.9832
## raceWhite                 0.7697     1.2992    0.5969    0.9925
## t_stage                   0.9728     1.0279    0.8054    1.1751
## n_stage                   1.5742     0.6353    1.0764    2.3022
## x6th_stageIIB             1.5080     0.6631    1.1199    2.0306
## x6th_stageIIIA            1.2789     0.7819    0.8060    2.0294
## x6th_stageIIIB            2.3081     0.4333    1.0771    4.9458
## x6th_stageIIIC            1.1481     0.8710    0.4744    2.7788
## grade                     1.3212     0.7569    1.1545    1.5121
## tumor_size                1.0052     0.9949    0.9997    1.0107
## estrogen_status           0.7115     1.4055    0.5414    0.9350
## progesterone_status       0.6382     1.5668    0.5163    0.7890
## regional_node_examined    0.9790     1.0214    0.9665    0.9917
## reginol_node_positive     1.0277     0.9731    1.0056    1.0503
## a_stage_regional          0.7371     1.3567    0.4959    1.0955
## Married                   0.8920     1.1211    0.7544    1.0547
## 
## Concordance= 0.696  (se = 0.011 )
## Likelihood ratio test= 302.6  on 17 df,   p=<2e-16
## Wald test            = 323.8  on 17 df,   p=<2e-16
## Score (logrank) test = 354.9  on 17 df,   p=<2e-16
ph_test <- cox.zph(cox_model1)
print(ph_test)
##                          chisq df       p
## age                     0.1978  1    0.66
## race                    0.9493  2    0.62
## x6th_stage              7.3617  4    0.12
## grade                   0.1293  1    0.72
## tumor_size              0.0858  1    0.77
## estrogen_status        23.1261  1 1.5e-06
## progesterone_status    19.6293  1 9.4e-06
## regional_node_examined  0.0880  1    0.77
## a_stage_regional        0.6691  1    0.41
## Married                 2.3184  1    0.13
## GLOBAL                 47.1764 14 1.8e-05
plot(ph_test) 

The Cox model assumes that hazard ratios are constant over time. A non-significant p-value (p > 0.05) indicates that the PH assumption holds. As GLOBAL 47.176 15 1.8e-05, the model did not meet the assumption, as same as estrogen_status and progesterone_status.We need further improve our model.

cox_model2 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade + 
                            tumor_size + regional_node_examined + a_stage_regional+Married,
                            data = final,x = TRUE)
summary(cox_model2)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race + 
##     x6th_stage + grade + tumor_size + regional_node_examined + 
##     a_stage_regional + Married, data = final, x = TRUE)
## 
##   n= 1232, number of events= 616 
## 
##                             coef exp(coef)  se(coef)      z Pr(>|z|)    
## age                     0.018335  1.018504  0.004590  3.994 6.49e-05 ***
## raceOther              -0.464884  0.628208  0.213238 -2.180  0.02925 *  
## raceWhite              -0.328808  0.719781  0.129044 -2.548  0.01083 *  
## x6th_stageIIB           0.408660  1.504799  0.137857  2.964  0.00303 ** 
## x6th_stageIIIA          0.716327  2.046901  0.139572  5.132 2.86e-07 ***
## x6th_stageIIIB          1.202920  3.329827  0.273778  4.394 1.11e-05 ***
## x6th_stageIIIC          1.451305  4.268680  0.155308  9.345  < 2e-16 ***
## grade                   0.368143  1.445048  0.066173  5.563 2.65e-08 ***
## tumor_size              0.002535  1.002539  0.001888  1.343  0.17931    
## regional_node_examined -0.014263  0.985838  0.005689 -2.507  0.01217 *  
## a_stage_regional       -0.332487  0.717138  0.196343 -1.693  0.09038 .  
## Married                -0.132085  0.876267  0.084540 -1.562  0.11819    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                        exp(coef) exp(-coef) lower .95 upper .95
## age                       1.0185     0.9818    1.0094    1.0277
## raceOther                 0.6282     1.5918    0.4136    0.9541
## raceWhite                 0.7198     1.3893    0.5589    0.9269
## x6th_stageIIB             1.5048     0.6645    1.1485    1.9716
## x6th_stageIIIA            2.0469     0.4885    1.5570    2.6909
## x6th_stageIIIB            3.3298     0.3003    1.9471    5.6946
## x6th_stageIIIC            4.2687     0.2343    3.1484    5.7875
## grade                     1.4450     0.6920    1.2693    1.6452
## tumor_size                1.0025     0.9975    0.9988    1.0063
## regional_node_examined    0.9858     1.0144    0.9749    0.9969
## a_stage_regional          0.7171     1.3944    0.4881    1.0537
## Married                   0.8763     1.1412    0.7425    1.0342
## 
## Concordance= 0.67  (se = 0.012 )
## Likelihood ratio test= 239.8  on 12 df,   p=<2e-16
## Wald test            = 247.9  on 12 df,   p=<2e-16
## Score (logrank) test = 271.8  on 12 df,   p=<2e-16
ph_test <- cox.zph(cox_model2)
print(ph_test)
##                           chisq df     p
## age                    2.87e-01  1 0.592
## race                   8.46e-01  2 0.655
## x6th_stage             5.72e+00  4 0.221
## grade                  3.61e-01  1 0.548
## tumor_size             6.89e-04  1 0.979
## regional_node_examined 5.56e-03  1 0.941
## a_stage_regional       1.36e+00  1 0.244
## Married                2.90e+00  1 0.089
## GLOBAL                 1.63e+01 12 0.178
plot(ph_test) 

now the assumption is been fully achieved after delete estrogen_status and progesterone_status

cox_model3 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade + 
                            log2(tumor_size+1) + log2(regional_node_examined+1)                                            +a_stage_regional + Married,
                            data = final)
summary(cox_model3)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race + 
##     x6th_stage + grade + log2(tumor_size + 1) + log2(regional_node_examined + 
##     1) + a_stage_regional + Married, data = final)
## 
##   n= 1232, number of events= 616 
## 
##                                       coef exp(coef)  se(coef)      z Pr(>|z|)
## age                               0.018234  1.018401  0.004601  3.963 7.41e-05
## raceOther                        -0.485708  0.615262  0.213037 -2.280  0.02261
## raceWhite                        -0.337849  0.713303  0.128868 -2.622  0.00875
## x6th_stageIIB                     0.365175  1.440766  0.146221  2.497  0.01251
## x6th_stageIIIA                    0.698826  2.011389  0.149210  4.684 2.82e-06
## x6th_stageIIIB                    1.168003  3.215565  0.274916  4.249 2.15e-05
## x6th_stageIIIC                    1.443291  4.234611  0.161875  8.916  < 2e-16
## grade                             0.363796  1.438780  0.066076  5.506 3.68e-08
## log2(tumor_size + 1)              0.092885  1.097335  0.052389  1.773  0.07623
## log2(regional_node_examined + 1) -0.167005  0.846196  0.051160 -3.264  0.00110
## a_stage_regional                 -0.336402  0.714336  0.195087 -1.724  0.08464
## Married                          -0.128061  0.879800  0.084592 -1.514  0.13006
##                                     
## age                              ***
## raceOther                        *  
## raceWhite                        ** 
## x6th_stageIIB                    *  
## x6th_stageIIIA                   ***
## x6th_stageIIIB                   ***
## x6th_stageIIIC                   ***
## grade                            ***
## log2(tumor_size + 1)             .  
## log2(regional_node_examined + 1) ** 
## a_stage_regional                 .  
## Married                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                  exp(coef) exp(-coef) lower .95 upper .95
## age                                 1.0184     0.9819    1.0093    1.0276
## raceOther                           0.6153     1.6253    0.4052    0.9341
## raceWhite                           0.7133     1.4019    0.5541    0.9183
## x6th_stageIIB                       1.4408     0.6941    1.0818    1.9189
## x6th_stageIIIA                      2.0114     0.4972    1.5014    2.6947
## x6th_stageIIIB                      3.2156     0.3110    1.8761    5.5115
## x6th_stageIIIC                      4.2346     0.2361    3.0834    5.8157
## grade                               1.4388     0.6950    1.2640    1.6377
## log2(tumor_size + 1)                1.0973     0.9113    0.9903    1.2160
## log2(regional_node_examined + 1)    0.8462     1.1818    0.7655    0.9354
## a_stage_regional                    0.7143     1.3999    0.4874    1.0470
## Married                             0.8798     1.1366    0.7454    1.0385
## 
## Concordance= 0.672  (se = 0.012 )
## Likelihood ratio test= 245  on 12 df,   p=<2e-16
## Wald test            = 251.2  on 12 df,   p=<2e-16
## Score (logrank) test = 275.3  on 12 df,   p=<2e-16

Add log transformation to the tumor size and regional node_examined, now they are being statistically significant

cox_model4 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade + 
                            log2(tumor_size+1) + log2(regional_node_examined+1) +                                          +a_stage_regional+ Married + Married*log2(regional_node_examined+1),
                            data = final)
summary(cox_model4)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race + 
##     x6th_stage + grade + log2(tumor_size + 1) + log2(regional_node_examined + 
##     1) + +a_stage_regional + Married + Married * log2(regional_node_examined + 
##     1), data = final)
## 
##   n= 1232, number of events= 616 
## 
##                                               coef exp(coef)  se(coef)      z
## age                                       0.018417  1.018588  0.004613  3.993
## raceOther                                -0.497477  0.608063  0.213151 -2.334
## raceWhite                                -0.348671  0.705625  0.129089 -2.701
## x6th_stageIIB                             0.361952  1.436130  0.146214  2.476
## x6th_stageIIIA                            0.698313  2.010357  0.149257  4.679
## x6th_stageIIIB                            1.196914  3.309886  0.276906  4.322
## x6th_stageIIIC                            1.446583  4.248571  0.161670  8.948
## grade                                     0.366175  1.442208  0.066117  5.538
## log2(tumor_size + 1)                      0.092912  1.097365  0.052270  1.778
## log2(regional_node_examined + 1)         -0.251965  0.777272  0.073308 -3.437
## a_stage_regional                         -0.333872  0.716146  0.195867 -1.705
## Married                                  -0.676046  0.508624  0.356283 -1.897
## log2(regional_node_examined + 1):Married  0.146154  1.157375  0.092504  1.580
##                                          Pr(>|z|)    
## age                                      6.53e-05 ***
## raceOther                                0.019600 *  
## raceWhite                                0.006913 ** 
## x6th_stageIIB                            0.013305 *  
## x6th_stageIIIA                           2.89e-06 ***
## x6th_stageIIIB                           1.54e-05 ***
## x6th_stageIIIC                            < 2e-16 ***
## grade                                    3.06e-08 ***
## log2(tumor_size + 1)                     0.075478 .  
## log2(regional_node_examined + 1)         0.000588 ***
## a_stage_regional                         0.088272 .  
## Married                                  0.057762 .  
## log2(regional_node_examined + 1):Married 0.114113    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                          exp(coef) exp(-coef) lower .95
## age                                         1.0186     0.9818    1.0094
## raceOther                                   0.6081     1.6446    0.4004
## raceWhite                                   0.7056     1.4172    0.5479
## x6th_stageIIB                               1.4361     0.6963    1.0783
## x6th_stageIIIA                              2.0104     0.4974    1.5005
## x6th_stageIIIB                              3.3099     0.3021    1.9236
## x6th_stageIIIC                              4.2486     0.2354    3.0948
## grade                                       1.4422     0.6934    1.2669
## log2(tumor_size + 1)                        1.0974     0.9113    0.9905
## log2(regional_node_examined + 1)            0.7773     1.2866    0.6732
## a_stage_regional                            0.7161     1.3964    0.4878
## Married                                     0.5086     1.9661    0.2530
## log2(regional_node_examined + 1):Married    1.1574     0.8640    0.9655
##                                          upper .95
## age                                         1.0278
## raceOther                                   0.9234
## raceWhite                                   0.9088
## x6th_stageIIB                               1.9127
## x6th_stageIIIA                              2.6935
## x6th_stageIIIB                              5.6953
## x6th_stageIIIC                              5.8325
## grade                                       1.6418
## log2(tumor_size + 1)                        1.2157
## log2(regional_node_examined + 1)            0.8974
## a_stage_regional                            1.0513
## Married                                     1.0225
## log2(regional_node_examined + 1):Married    1.3874
## 
## Concordance= 0.673  (se = 0.012 )
## Likelihood ratio test= 247.5  on 13 df,   p=<2e-16
## Wald test            = 252.5  on 13 df,   p=<2e-16
## Score (logrank) test = 277.1  on 13 df,   p=<2e-16

Noticing married seems insignificant in every variable, we tried to add interaction term with married and all the remaining variables, but the interaction term are all insignificant as well as the main marriage effect except for the interaction between married and log2(regional_node_examined) make the married main effect being statisticlally significant under 0.05 level

cox_model5 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade + 
                            log2(tumor_size+1) + log2(regional_node_examined+1)+ a_stage_regional,
                            data = final)
summary(cox_model5)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race + 
##     x6th_stage + grade + log2(tumor_size + 1) + log2(regional_node_examined + 
##     1) + a_stage_regional, data = final)
## 
##   n= 1232, number of events= 616 
## 
##                                       coef exp(coef)  se(coef)      z Pr(>|z|)
## age                               0.018899  1.019079  0.004591  4.117 3.84e-05
## raceOther                        -0.529895  0.588667  0.211059 -2.511  0.01205
## raceWhite                        -0.379032  0.684524  0.126026 -3.008  0.00263
## x6th_stageIIB                     0.352933  1.423236  0.146071  2.416  0.01568
## x6th_stageIIIA                    0.684198  1.982182  0.149092  4.589 4.45e-06
## x6th_stageIIIB                    1.137605  3.119289  0.273191  4.164 3.13e-05
## x6th_stageIIIC                    1.439728  4.219548  0.161704  8.903  < 2e-16
## grade                             0.367059  1.443483  0.066129  5.551 2.85e-08
## log2(tumor_size + 1)              0.098186  1.103168  0.052181  1.882  0.05989
## log2(regional_node_examined + 1) -0.167119  0.846099  0.051162 -3.266  0.00109
## a_stage_regional                 -0.347075  0.706752  0.194455 -1.785  0.07428
##                                     
## age                              ***
## raceOther                        *  
## raceWhite                        ** 
## x6th_stageIIB                    *  
## x6th_stageIIIA                   ***
## x6th_stageIIIB                   ***
## x6th_stageIIIC                   ***
## grade                            ***
## log2(tumor_size + 1)             .  
## log2(regional_node_examined + 1) ** 
## a_stage_regional                 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                  exp(coef) exp(-coef) lower .95 upper .95
## age                                 1.0191     0.9813    1.0100    1.0283
## raceOther                           0.5887     1.6988    0.3892    0.8903
## raceWhite                           0.6845     1.4609    0.5347    0.8763
## x6th_stageIIB                       1.4232     0.7026    1.0689    1.8950
## x6th_stageIIIA                      1.9822     0.5045    1.4799    2.6549
## x6th_stageIIIB                      3.1193     0.3206    1.8261    5.3284
## x6th_stageIIIC                      4.2195     0.2370    3.0734    5.7931
## grade                               1.4435     0.6928    1.2680    1.6432
## log2(tumor_size + 1)                1.1032     0.9065    0.9959    1.2220
## log2(regional_node_examined + 1)    0.8461     1.1819    0.7654    0.9353
## a_stage_regional                    0.7068     1.4149    0.4828    1.0346
## 
## Concordance= 0.67  (se = 0.012 )
## Likelihood ratio test= 242.7  on 11 df,   p=<2e-16
## Wald test            = 248.1  on 11 df,   p=<2e-16
## Score (logrank) test = 272.3  on 11 df,   p=<2e-16

Since the interaction term gives no significant significant value, we directly drop the marriage variable

cox_model6 <- coxph(Surv(survival_months, status) ~ age + race + x6th_stage + grade + 
                            log2(regional_node_examined+1),
                            data = final)
summary(cox_model6)
## Call:
## coxph(formula = Surv(survival_months, status) ~ age + race + 
##     x6th_stage + grade + log2(regional_node_examined + 1), data = final)
## 
##   n= 1232, number of events= 616 
## 
##                                       coef exp(coef)  se(coef)      z Pr(>|z|)
## age                               0.017705  1.017863  0.004567  3.877 0.000106
## raceOther                        -0.513619  0.598326  0.210909 -2.435 0.014881
## raceWhite                        -0.367476  0.692480  0.125910 -2.919 0.003517
## x6th_stageIIB                     0.461184  1.585950  0.134452  3.430 0.000603
## x6th_stageIIIA                    0.821688  2.274336  0.129512  6.345 2.23e-10
## x6th_stageIIIB                    1.413804  4.111567  0.247599  5.710 1.13e-08
## x6th_stageIIIC                    1.622178  5.064110  0.140484 11.547  < 2e-16
## grade                             0.374943  1.454908  0.066050  5.677 1.37e-08
## log2(regional_node_examined + 1) -0.173489  0.840726  0.051254 -3.385 0.000712
##                                     
## age                              ***
## raceOther                        *  
## raceWhite                        ** 
## x6th_stageIIB                    ***
## x6th_stageIIIA                   ***
## x6th_stageIIIB                   ***
## x6th_stageIIIC                   ***
## grade                            ***
## log2(regional_node_examined + 1) ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##                                  exp(coef) exp(-coef) lower .95 upper .95
## age                                 1.0179     0.9825    1.0088    1.0270
## raceOther                           0.5983     1.6713    0.3957    0.9046
## raceWhite                           0.6925     1.4441    0.5410    0.8863
## x6th_stageIIB                       1.5859     0.6305    1.2186    2.0641
## x6th_stageIIIA                      2.2743     0.4397    1.7645    2.9315
## x6th_stageIIIB                      4.1116     0.2432    2.5308    6.6798
## x6th_stageIIIC                      5.0641     0.1975    3.8452    6.6693
## grade                               1.4549     0.6873    1.2782    1.6560
## log2(regional_node_examined + 1)    0.8407     1.1894    0.7604    0.9296
## 
## Concordance= 0.668  (se = 0.012 )
## Likelihood ratio test= 236.2  on 9 df,   p=<2e-16
## Wald test            = 238.6  on 9 df,   p=<2e-16
## Score (logrank) test = 261  on 9 df,   p=<2e-16

further drop variables tumor_size and a_stage_regional due to quite low significant value.

Model Evaluation

summary(final$survival_months)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   43.00   62.00   61.24   82.00  107.00
library(timeROC)
cox_models <- list(cox_model1, cox_model2, cox_model3, cox_model4, cox_model5, cox_model6)

time_points <- c(2,43,62,82,107)

dynamic_auc_results <- list()

for (i in 1:length(cox_models)) {
  
  current_model <- cox_models[[i]]
  time_roc <- timeROC(T = final$survival_months, 
                    delta = final$status, 
                    marker = predict(current_model, type = "lp"), 
                    cause = 1,  # Event of interest
                    times = time_points,  # Time points
                    iid = TRUE)

  
  dynamic_auc_results[[paste0("Model_", i)]] <- time_roc$AUC
  
  print(paste("AUC for Model", i, "at time points:", paste(time_points, collapse = ", ")))
  print(round(time_roc$AUC, 3))
}
## [1] "AUC for Model 1 at time points: 2, 43, 62, 82, 107"
##   t=2  t=43  t=62  t=82 t=107 
##    NA 0.746 0.721 0.724    NA 
## [1] "AUC for Model 2 at time points: 2, 43, 62, 82, 107"
##   t=2  t=43  t=62  t=82 t=107 
##    NA 0.702 0.698 0.721    NA 
## [1] "AUC for Model 3 at time points: 2, 43, 62, 82, 107"
##   t=2  t=43  t=62  t=82 t=107 
##    NA 0.703 0.701 0.723    NA 
## [1] "AUC for Model 4 at time points: 2, 43, 62, 82, 107"
##   t=2  t=43  t=62  t=82 t=107 
##    NA 0.705 0.703 0.724    NA 
## [1] "AUC for Model 5 at time points: 2, 43, 62, 82, 107"
##   t=2  t=43  t=62  t=82 t=107 
##    NA 0.700 0.700 0.722    NA 
## [1] "AUC for Model 6 at time points: 2, 43, 62, 82, 107"
##   t=2  t=43  t=62  t=82 t=107 
##    NA 0.696 0.696 0.721    NA

model 4 tends to have a relatively higher AUC throughout different time points

C-index

cox_model1$concordance
##   concordant   discordant       tied.x       tied.y      tied.xy  concordance 
## 3.518110e+05 1.553440e+05 0.000000e+00 2.219000e+03 0.000000e+00 6.936952e-01 
##          std 
## 1.132421e-02
cox_model2$concordance
##   concordant   discordant       tied.x       tied.y      tied.xy  concordance 
## 3.399160e+05 1.672380e+05 1.000000e+00 2.219000e+03 0.000000e+00 6.702418e-01 
##          std 
## 1.160463e-02
cox_model3$concordance
##   concordant   discordant       tied.x       tied.y      tied.xy  concordance 
## 3.409430e+05 1.662110e+05 1.000000e+00 2.219000e+03 0.000000e+00 6.722669e-01 
##          std 
## 1.158216e-02
cox_model4$concordance
##   concordant   discordant       tied.x       tied.y      tied.xy  concordance 
## 3.414850e+05 1.656690e+05 1.000000e+00 2.219000e+03 0.000000e+00 6.733356e-01 
##          std 
## 1.160303e-02
cox_model5$concordance
##   concordant   discordant       tied.x       tied.y      tied.xy  concordance 
## 3.398910e+05 1.672620e+05 2.000000e+00 2.219000e+03 0.000000e+00 6.701935e-01 
##          std 
## 1.163085e-02
cox_model6$concordance
##   concordant   discordant       tied.x       tied.y      tied.xy  concordance 
## 3.387630e+05 1.683460e+05 4.600000e+01 2.219000e+03 0.000000e+00 6.680127e-01 
##          std 
## 1.158565e-02

Concordance: model1 69.4 model2 67.0 model3 67.2 model4 69.6 (overall account for meeting the model assumption and the multicollinearity, model 4 have the highest concordance score, although accompanied with “married” variable that is not statistically significant) model5 67.0 model6 66.8

dev_residuals <- residuals(cox_model4, type = "deviance")
plot(dev_residuals, main = "Deviance Residuals", ylab = "Residuals", xlab = "Index")
abline(h = c(-2, 2), col = "red", lty = 2) 

surv_fit <- survfit(Surv(survival_months, status) ~ 1, data = final)

plot(surv_fit, xlab = "Time (months)", ylab = "Survival Probability", 
     main = "Survival Curve for the Final Model", col = "blue", lwd = 2)

grid()